{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "68909c78",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<|endoftext|>\n"
     ]
    }
   ],
   "source": [
    "from transformers import AutoTokenizer\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"bigcode/starcoderplus\", use_auth_token=True)\n",
    "eos_token = tokenizer.eos_token\n",
    "print(eos_token)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "ea9e91c7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n",
      "\n",
      "If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don’t know the answer to a question, please don’t share false information.\n"
     ]
    }
   ],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "\"\"\"\n",
    "We will combine following datasets:\n",
    "1. Open Assistant Guanaco: For following instructions and converse\n",
    "2. LIMA: For following instructions and converse\n",
    "\"\"\"\n",
    "\n",
    "\n",
    "\"\"\"\n",
    "Format of the dataset:\n",
    "\n",
    "<|system|> system message <|endoftext|> <|prompter|> Q1 <|endoftext|> <|assistant|> A1 <|endoftext|>\n",
    "\"\"\"\n",
    "\n",
    "\n",
    "system_prompt = \"\"\"You are a helpful, respectful and honest assistant. Always answer as helpfully \\\n",
    "as possible, while being safe. Your answers should not include any harmful, \\\n",
    "unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that \\\n",
    "your responses are socially unbiased and positive in nature.\n",
    "\n",
    "If a question does not make any sense, or is not factually coherent, explain why \\\n",
    "instead of answering something not correct. If you don’t know the answer to a \\\n",
    "question, please don’t share false information.\"\"\"\n",
    "\n",
    "print(system_prompt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "3beab4cd",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Found cached dataset lima (/raid/sourab/.cache/huggingface/datasets/GAIR___lima/plain_text/0.0.1/f882fbf63e999e19fc8841fab01c292dd00433ae9bc4f0f177b0b1c484771179)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b9708f65cb8c46fd83ba1689df163559",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/2 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading cached processed dataset at /raid/sourab/.cache/huggingface/datasets/GAIR___lima/plain_text/0.0.1/f882fbf63e999e19fc8841fab01c292dd00433ae9bc4f0f177b0b1c484771179/cache-0ecec33228122a06.arrow\n",
      "Loading cached processed dataset at /raid/sourab/.cache/huggingface/datasets/GAIR___lima/plain_text/0.0.1/f882fbf63e999e19fc8841fab01c292dd00433ae9bc4f0f177b0b1c484771179/cache-932634d2b3375595.arrow\n",
      "Loading cached shuffled indices for dataset at /raid/sourab/.cache/huggingface/datasets/GAIR___lima/plain_text/0.0.1/f882fbf63e999e19fc8841fab01c292dd00433ae9bc4f0f177b0b1c484771179/cache-27e5171340f1f4df.arrow\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DatasetDict({\n",
      "    train: Dataset({\n",
      "        features: ['content'],\n",
      "        num_rows: 1030\n",
      "    })\n",
      "    test: Dataset({\n",
      "        features: ['content'],\n",
      "        num_rows: 300\n",
      "    })\n",
      "})\n",
      "{'content': '<|system|> You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\\n\\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don’t know the answer to a question, please don’t share false information. <|endoftext|> <|prompter|> Are we alone in the universe? <|endoftext|> <|assistant|> Humanity has yet to find evidence for life outside planet Earth.\\n\\nThe current search for extraterrestrial life is largely focused on finding planets that are situated in an \"habitable zone\". Roughly put, a planet is in a habitable zone if its distance from nearby stars allows for liquid water to persist on its surface.\\nSo far, a few such planets have been found, but none yet with liquid water on its surface. <|endoftext|>'}\n"
     ]
    }
   ],
   "source": [
    "# LIMA dataset processing\n",
    "\n",
    "def preprocess(samples):\n",
    "    conv_prefix = f\"<|system|> {system_prompt} {eos_token}\"\n",
    "    batch = []\n",
    "    for sample in samples[\"conversations\"]:\n",
    "        formatted_conv = conv_prefix\n",
    "        for i, turn in enumerate(sample):\n",
    "            turn_prefix = \"<|assistant|>\" if (i+1)%2==0 else \"<|prompter|>\"\n",
    "            formatted_conv += f\" {turn_prefix} {turn} {eos_token}\"\n",
    "        batch.append(formatted_conv)\n",
    "    return {\"content\": batch}\n",
    "            \n",
    "    \n",
    "\n",
    "\n",
    "lima = load_dataset(\"GAIR/lima\")\n",
    "lima = lima.map(\n",
    "    preprocess,\n",
    "    batched=True,\n",
    "    remove_columns=lima[\"train\"].column_names\n",
    ")\n",
    "\n",
    "lima[\"train\"] = lima[\"train\"].shuffle(100)\n",
    "\n",
    "print(lima)\n",
    "print(lima[\"train\"][0])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "e7529570",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Found cached dataset json (/raid/sourab/.cache/huggingface/datasets/timdettmers___json/timdettmers--openassistant-guanaco-6126c710748182cf/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "54ddeb79705e44debcd41692b68516cc",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/2 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading cached processed dataset at /raid/sourab/.cache/huggingface/datasets/timdettmers___json/timdettmers--openassistant-guanaco-6126c710748182cf/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-d71fcbcad60fd547.arrow\n",
      "Loading cached processed dataset at /raid/sourab/.cache/huggingface/datasets/timdettmers___json/timdettmers--openassistant-guanaco-6126c710748182cf/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-54124603983709e2.arrow\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DatasetDict({\n",
      "    train: Dataset({\n",
      "        features: ['content'],\n",
      "        num_rows: 9846\n",
      "    })\n",
      "    test: Dataset({\n",
      "        features: ['content'],\n",
      "        num_rows: 518\n",
      "    })\n",
      "})\n",
      "{'content': \"<|system|> You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\\n\\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don’t know the answer to a question, please don’t share false information. <|endoftext|> <|prompter|> Can you explain me how cheats are working? <|endoftext|> <|assistant|> Cheating can be of various types. I will need more information on what type of cheating you are referring to before I can provide you with any information. <|endoftext|> <|prompter|> How do cheats in games like COD and CS:GO work? What goes into them? Please discuss the ethics of cheats as well. <|endoftext|> <|assistant|> Cheats in games like Call of Duty (COD) and Counter-Strike: Global Offensive (CS:GO) typically work by manipulating the game's code to gain an unfair advantage over other players. Cheats can range from simple mods that change the game's appearance to more advanced hacks that provide aimbot, wallhack, or other game-breaking features.\\n\\nTo create cheats, developers typically reverse-engineer the game's code to identify vulnerabilities that can be exploited. They may then create custom code that overrides the game's normal behavior, allowing them to access hidden features, bypass game mechanics, or gain an unfair advantage.\\n\\nEthically, the use of cheats in multiplayer games is generally considered to be cheating and is frowned upon by the gaming community. Cheating can ruin the experience for other players, create an unfair playing field, and damage the integrity of the game.\\n\\nAdditionally, many game developers explicitly prohibit the use of cheats in their games and may take steps to prevent cheating, such as implementing anti-cheat measures or banning players caught using cheats.\\n\\nSome argue that cheating in single-player games is less harmful and can even enhance the player's experience by allowing them to experiment with the game's mechanics in new and creative ways. However, cheating in single-player games can still have negative consequences, such as undermining the player's sense of achievement or spoiling the game's narrative.\\n\\nIn summary, cheats in games like COD and CS:GO are created by manipulating the game's code to gain an unfair advantage over other players. While some argue that cheating in single-player games is less harmful, cheating in multiplayer games is generally considered unethical and can ruin the experience for others. Game developers typically prohibit the use of cheats and may take steps to prevent cheating. <|endoftext|>\"}\n"
     ]
    }
   ],
   "source": [
    "# OpenAsst Guanaco dataset processing\n",
    "\n",
    "tokens = [\"### Human:\", \"### Assistant:\"]\n",
    "\n",
    "import re\n",
    "\n",
    "def split_on_multiple_tokens(input_string, tokens):\n",
    "    # Combine the tokens into a regular expression pattern using the '|' (OR) operator\n",
    "    pattern = '|'.join(re.escape(token) for token in tokens)\n",
    "    \n",
    "    # Split the input string using the generated pattern\n",
    "    split_result = re.split(pattern, input_string)\n",
    "    \n",
    "    # Remove any empty strings resulting from consecutive delimiters\n",
    "    split_result = [part.strip() for part in split_result if part.strip()]\n",
    "    \n",
    "    return split_result\n",
    "\n",
    "def preprocess(samples):\n",
    "    conv_prefix = f\"<|system|> {system_prompt} {eos_token}\"\n",
    "    batch = []\n",
    "    for sample in samples[\"text\"]:\n",
    "        sample = split_on_multiple_tokens(sample, tokens)\n",
    "        formatted_conv = conv_prefix\n",
    "        for i, turn in enumerate(sample):\n",
    "            turn_prefix = \"<|assistant|>\" if (i+1)%2==0 else \"<|prompter|>\"\n",
    "            formatted_conv += f\" {turn_prefix} {turn} {eos_token}\"\n",
    "        batch.append(formatted_conv)\n",
    "    return {\"content\": batch}\n",
    "            \n",
    "    \n",
    "\n",
    "\n",
    "guanaco = load_dataset(\"timdettmers/openassistant-guanaco\")\n",
    "guanaco = guanaco.map(\n",
    "    preprocess,\n",
    "    batched=True,\n",
    "    remove_columns=guanaco[\"train\"].column_names\n",
    ")\n",
    "\n",
    "guanaco[\"train\"] = guanaco[\"train\"].shuffle()\n",
    "\n",
    "print(guanaco)\n",
    "print(guanaco[\"train\"][0])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "id": "31f5c4a8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['content'],\n",
       "        num_rows: 10876\n",
       "    })\n",
       "    test: Dataset({\n",
       "        features: ['content'],\n",
       "        num_rows: 818\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from datasets import concatenate_datasets, DatasetDict\n",
    "full_dataset = DatasetDict({split: concatenate_datasets([lima[split], guanaco[split]]) for split in [\"train\", \"test\"]})\n",
    "full_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "id": "4f960de1",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Pushing split train to the Hub.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "507b11c048fb425497271217b59aa2fe",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "13275b90b5c54e28920989f16b8e58cb",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating parquet from Arrow format:   0%|          | 0/11 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Pushing split test to the Hub.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0b955fb4f6914a2d9fea7815aeda0f47",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "93201261746e487a90010ca4c25aef90",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "full_dataset.push_to_hub(\"code-chat-assistant-v1\", private=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a734789e",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
